import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime as dt

## plt setting
plt.rc('font', size=14)
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names, so prefer the new name and fall
# back to the legacy one on older installations.
_style_name = 'seaborn-v0_8-bright'
if _style_name not in plt.style.available:
    _style_name = 'seaborn-bright'
plt.style.use(_style_name)

# Snopes data load
sn = pd.read_csv("./snopes.csv")

# Only select Fact Checks. Copy so the column assignments below operate on an
# independent frame instead of a slice of the raw CSV data.
sn = sn[sn['page_type'] == "Fact Check"].copy()
sn['date_published'] = pd.to_datetime(sn['date_published'])
sn['date_updated'] = pd.to_datetime(sn['date_updated'])

# Restrict to the study window first, so the year/month/day columns derived
# below never contain missing dates.
sn = sn[sn['date_published'].between(dt(2019, 5, 17), dt(2022, 8, 31))].copy()
len(sn) # 5932

# Vectorised date parts (replaces a row-by-row DataFrame.apply).
sn['yearp'] = sn['date_published'].dt.year
sn['monthp'] = sn['date_published'].dt.month
sn['dayp'] = sn['date_published'].dt.day

sn.rename(columns={'date_published':"fc_date",'yearp':'fc_year','monthp':'fc_month','dayp':'fc_day','author_name':'author'},inplace=True)
sn.columns

set(sn['rating'].to_list())
sn.loc[sn['rating']=="True","rating"] = "TRUE"
sn.loc[sn['rating']=="False","rating"] = "FALSE"
# Only the rating cell is cleared. Without the column selector the whole row
# (claim, author, dates, ...) was overwritten with None.
sn.loc[sn['rating']=="none","rating"] = None

# Politifact data load
pf = pd.read_csv("./politifact.csv")
len(pf) #10710

# Parse both date columns, then keep only fact-checks published inside the
# study window (bounds inclusive).
pf['cdate'] = pd.to_datetime(pf['cdate'])
pf['fc_date'] = pd.to_datetime(pf['fc_date'])
pf_in_window = pf['fc_date'].between(dt(2019, 5, 17), dt(2022, 8, 31))
pf = pf[pf_in_window]
len(pf) #5806



# AAP data load
aap = pd.read_csv("./aap.csv")
aap.columns
len(aap) #843

# Parse the publication date and keep the same inclusive study window that is
# applied to the other fact-checkers.
aap['fc_date'] = pd.to_datetime(aap['fc_date'])
aap_in_window = aap['fc_date'].between(dt(2019, 5, 17), dt(2022, 8, 31))
aap = aap[aap_in_window] # 827
len(aap) #827

# AAP bylines can list several authors. Normalise every byline to a
# "; "-delimited list so it can be split reliably later on.
import re

# Separators handled: the word "and", "& " and ",". The leading \b keeps names
# that merely contain "and " (e.g. "Roland Smith") in one piece.
_AAP_AUTHOR_SEP = re.compile(r"\band |& |,")


def _normalise_aap_author(author):
    """Return *author* as a "; "-joined list of stripped names.

    The house bylines "AAP FactCheck" and "FactCheck" are unified to
    "AAP Factcheck". Non-string values (e.g. a missing byline) are returned
    unchanged, and empty fragments produced by adjacent separators are dropped.
    """
    if not isinstance(author, str):
        return author
    if author in ("AAP FactCheck", "FactCheck"):
        author = "AAP Factcheck"
    pieces = (piece.strip() for piece in _AAP_AUTHOR_SEP.split(author))
    return "; ".join(piece for piece in pieces if piece)


# Assign the whole column at once; the previous element-wise
# `aap['author'].iloc[i] = ...` was chained assignment, which pandas does not
# guarantee to write back to `aap`.
aap['author'] = aap['author'].map(_normalise_aap_author)


# Logically data load
lg = pd.read_csv("./logically.csv")
# Keep only the articles whose location is the United States.
lg_is_us = lg['location'] == "United States"
lg = lg[lg_is_us] # only for us
len(lg) #993

# Date
def date_img(dataset, start_year=2019, end_year=2022):
    """Count articles per calendar month.

    Parameters
    ----------
    dataset : DataFrame with 'fc_year', 'fc_month' and 'link' columns.
    start_year, end_year : first and last year (inclusive) to report. The
        defaults reproduce the original fixed 2019-2022 range.

    Returns
    -------
    (dname, cname) : two parallel lists with one entry per month of every
        year in the range. ``dname`` holds "year-month" labels such as
        "2019-5"; ``cname`` holds the number of rows with a non-null 'link'
        for that month, or 0 when the month has no rows.
    """
    # Group once instead of once per month, and turn the result into a plain
    # dict so that absent months are handled with .get() rather than a bare
    # except.
    per_month = dataset.groupby(['fc_year', 'fc_month'])['link'].count()
    lookup = {(int(year), int(month)): int(count)
              for (year, month), count in per_month.items()}

    dname = []
    cname = []
    for yy in range(start_year, end_year + 1):
        for mm in range(1, 13):
            dname.append("{}-{}".format(yy, mm))
            cname.append(lookup.get((yy, mm), 0))
    return dname, cname

# Monthly article counts for each fact-checker.
sndate, sncnt = date_img(sn)
pfdate, pfcnt = date_img(pf)
aapdate, aapcnt = date_img(aap)
lgdate, lgcnt = date_img(lg)

# Keep positions 4..43 of the monthly lists for every series.
month_window = slice(4, 44)
sndate = sndate[month_window]
sncnt, pfcnt, aapcnt, lgcnt = (
    monthly[month_window] for monthly in (sncnt, pfcnt, aapcnt, lgcnt)
)

# (counts, line style, marker, legend text) per fact-checker, in draw order.
monthly_series = [
    (sncnt, "-", ".", "Snopes (Total Fcs: %s)" % len(sn)),
    (pfcnt, "--", "o", "Politifact (Total Fcs: %s)" % len(pf)),
    (aapcnt, "-.", "v", "AAP (Total Fcs: %s)" % len(aap)),
    (lgcnt, ":", "^", "Logically (Total Fcs: %s)" % len(lg)),
]
for series_counts, series_line, series_marker, series_label in monthly_series:
    plt.plot(series_counts, linestyle=series_line, marker=series_marker,
             label=series_label)

plt.grid()
plt.legend(fontsize=9)
plt.ylim(-10, 350)
# Label every fifth month to keep the x axis readable.
tick_positions = np.arange(0, len(sncnt), 5)
plt.xticks(tick_positions, sndate[::5], rotation=90)
plt.subplots_adjust(bottom=0.3, top=0.95, right=0.95)
plt.xlabel("Date (yyyy-mm)")
plt.ylabel("Number of articles")
plt.show()
plt.close()

# authors
# Distinct 'fc_year' values found in the Politifact data (group keys of a
# groupby on that column).
years_li = pf.groupby('fc_year').size().index.to_list()

def author_year(df, years=None):
    """Count the distinct authors per year.

    Parameters
    ----------
    df : DataFrame with 'fc_year' and 'author' columns.
    years : iterable of years to report. When omitted, the module-level
        ``years_li`` is used (looked up at call time), which is the original
        behaviour.

    Returns
    -------
    (x, y) : ``x`` is the list of years as strings, ``y`` the number of
        distinct non-null authors for each of those years. A year that has no
        rows in ``df`` yields 0 instead of raising ``KeyError``.
    """
    if years is None:
        years = years_li
    years = list(years)

    # One grouped pass instead of two groupby+sort passes per year.
    per_year = dict(df.groupby('fc_year')['author'].nunique().items())

    y = []
    for yy in years:
        n_authors = int(per_year.get(yy, 0))
        print(str(yy) + ": # authors = " + str(n_authors))
        y.append(n_authors)
    x = [str(yy) for yy in years]
    return x, y

snx,sny = author_year(sn)
pfx,pfy = author_year(pf)
x_axis = np.arange(len(snx))

## Note: after restricting lg to US articles there are no 2019 articles, so a
## zero is added manually for 2019.
years_li = lg.groupby('fc_year').count().index.to_list()
lgx,lgy = author_year(lg)
lgy.insert(0,0)
# Insert the label as well; the previous `lgx[0]='2019'` overwrote the first
# real year and left lgx one element shorter than lgy.
lgx.insert(0,'2019')

# author_year() reads the module-level years_li, so restore the Politifact
# years for everything that follows.
years_li = pf.groupby('fc_year').count().index.to_list()


def _split_aap_authors(bylines):
    """Return the individual author names contained in ";"-delimited bylines.

    Names are stripped of surrounding whitespace; empty fragments and
    non-string values (e.g. a missing byline) are skipped.
    """
    names = []
    for byline in bylines:
        if not isinstance(byline, str):
            continue
        for part in byline.split(";"):
            part = part.strip()
            if part:
                names.append(part)
    return names


# Distinct AAP authors per year, derived from the data.
# NOTE(review): this replaces the hard-coded `aapy = [14,5,4,16]`; confirm the
# computed numbers match those values for the current aap.csv.
aapx = years_li
aapy = []
for yy in years_li:
    year_authors = set(_split_aap_authors(aap.loc[aap['fc_year'] == yy, 'author']))
    print(year_authors)
    print(len(year_authors))
    aapy.append(len(year_authors))

# Every individual AAP author over the whole period (for the legend total).
aap_auli = _split_aap_authors(aap['author'])


# nunique() ignores missing authors, so an absent byline is not counted as an
# author in the legend totals.
plt.bar(x_axis - 0.3,sny,width=0.2,label="Snopes"+"(Total: %s)"%sn['author'].nunique())
plt.bar(x_axis - 0.1,pfy,width=0.2,label="Politifact"+"(Total: %s)"%pf['author'].nunique())
plt.bar(x_axis + 0.1,aapy,width=0.2,label="AAP"+"(Total: %s)"%len(set(aap_auli)))
plt.bar(x_axis + 0.3,lgy,width=0.2,label="Logically"+"(Total: %s)"%lg['author'].nunique())
plt.xticks(np.arange(len(snx)),snx,rotation=90)
plt.legend(fontsize=9)
plt.subplots_adjust(bottom=0.2, top=0.95)
plt.ylabel("Number of authors ")
plt.xlabel("Year")
plt.show()
plt.close()

# Rating

set(sn['rating'])
set(pf['rating'])
set(lg['rating'])
set(aap['rating'])


def _harmonise_rating(ratings, mapping):
    """Translate raw rating labels into the shared rating scale.

    ``mapping`` maps a fact-checker's own labels to one of "False",
    "Mostly False", "Mixture", "Mostly True" or "True". Every label that is
    not in ``mapping`` (including a missing rating) becomes "Others".
    """
    return ratings.map(mapping).fillna("Others")


# Each 'rating2' column is built in one vectorised step. The previous
# row-by-row `df['rating2'].iloc[i] = ...` was chained assignment, which
# pandas does not guarantee to write back to the frame.
sn['rating2'] = _harmonise_rating(sn['rating'], {
    "FALSE": "False",
    "Mostly False": "Mostly False",
    "Mixture": "Mixture",
    "Mostly True": "Mostly True",
    "TRUE": "True",
})

pf['rating2'] = _harmonise_rating(pf['rating'], {
    "FALSE": "False",
    "pants-fire": "False",
    "barely-true": "Mostly False",
    "half-true": "Mixture",
    "mostly-true": "Mostly True",
    "TRUE": "True",
})
set(pf['rating2'])


# NOTE(review): "half-true" is the same label used for Politifact above,
# while the other Logically labels are upper-case; confirm it is the intended
# Logically label for "Mixture".
lg['rating2'] = _harmonise_rating(lg['rating'], {
    "FALSE": "False",
    "MISLEADING": "Mostly False",
    "half-true": "Mixture",
    "PARTLY TRUE": "Mostly True",
    "TRUE": "True",
})
set(lg['rating2'])

aap['rating2'] = _harmonise_rating(aap['rating'], {
    "FALSE": "False",
    "mostly false": "Mostly False",
    "mixture": "Mixture",
    "mostly true": "Mostly True",
    "TRUE": "True",
})
set(aap['rating2'])

def rating_img(df):
    """Tally the rows of ``df`` per harmonised rating.

    Returns ``(x, y)``: ``x`` is the fixed list of six rating labels and ``y``
    the number of rows whose 'rating2' equals each label, in the same order.
    """
    labels = ["False", "Mostly False", "Mixture", "Mostly True", "True", "Others"]
    tallies = []
    for label in labels:
        matches = df['rating2'].eq(label)
        tallies.append(matches.sum())
    return labels, tallies

# Per-rating article counts for each fact-checker.
snx,sny = rating_img(sn)
pfx,pfy = rating_img(pf)
aapx,aapy = rating_img(aap)
lgx,lgy = rating_img(lg)
x_axis = np.arange(len(snx))

# (bar offset, heights, legend name, source frame) - four bars per rating.
rating_bars = (
    (-0.3, sny, "Snopes", sn),
    (-0.1, pfy, "Politifact", pf),
    (0.1, aapy, "AAP", aap),
    (0.3, lgy, "Logically", lg),
)
for bar_shift, bar_heights, outlet, outlet_df in rating_bars:
    plt.bar(x_axis + bar_shift, bar_heights, width=0.2,
            label=outlet + "(Total: %s)" % len(outlet_df))

plt.xticks(np.arange(len(snx)), snx, rotation=90)
plt.legend()
plt.subplots_adjust(bottom=0.35, top=0.95, left=0.15)
plt.xlabel("Rating")
plt.ylabel("Number of articles")
plt.show()
plt.close()

# Top10 Authors
import string


def _top_author_counts(df, top_n=10):
    """Return the ``top_n`` largest per-author article counts, descending.

    Articles are counted as non-null 'link' values per distinct 'author'
    value. The list is padded with zeros when ``df`` has fewer than ``top_n``
    authors, so it always lines up with the ``top_n`` bar positions.
    """
    counts = (
        df.groupby('author')['link'].count()
        .sort_values(ascending=False)
        .to_list()[:top_n]
    )
    return counts + [0] * (top_n - len(counts))


# Anonymised x labels: "Author A" ... "Author J".
author_li = ["Author " + x for x in string.ascii_uppercase[0:10]]
sny = _top_author_counts(sn)
pfy = _top_author_counts(pf)
aapy = _top_author_counts(aap)
lgy = _top_author_counts(lg)
x_axis = np.arange(len(author_li))

plt.bar(x_axis - 0.3, sny, width=0.2, label="Snopes" + "(Total: %s)" % len(sn))
plt.bar(x_axis - 0.1, pfy, width=0.2, label="Politifact" + "(Total: %s)" % len(pf))
plt.bar(x_axis + 0.1, aapy, width=0.2, label="AAP" + "(Total: %s)" % len(aap))
plt.bar(x_axis + 0.3, lgy, width=0.2, label="Logically" + "(Total: %s)" % len(lg))
plt.xticks(np.arange(len(author_li)), author_li, rotation=90)
plt.legend()
plt.subplots_adjust(bottom=0.3, top=0.95, left=0.15, right=0.95)
plt.ylabel("Number of articles")
plt.xlabel("The 10 most prolific authors")
plt.show()
plt.close()

# Ratings of Top 10 authors
def author_dic(df,author_num):
    """Summarise the rating distribution of the most prolific authors.

    Parameters
    ----------
    df : DataFrame with 'author', 'rating2' and 'link' columns.
    author_num : number of top authors to include. Authors are ranked by
        their count of non-null 'link' values, descending.

    Returns
    -------
    dict with, for every selected author ``name``:
      * ``name``        -> article counts for the six ratings, in the order
                           False, Mostly False, Mixture, Mostly True, True,
                           Others;
      * ``name + "_p"`` -> the same counts as percentages of all the author's
                           articles, rounded to one decimal.
    All count entries are inserted before any percentage entry.
    """
    rating_li = ['False', 'Mostly False', 'Mixture', 'Mostly True', 'True', 'Others']
    top_authors = (
        df.groupby('author')['link'].count()
        .sort_values(ascending=False)
        .index[:author_num]
        .tolist()
    )

    dic_rat = {}
    totals = {}
    for name in top_authors:
        by_author = df['author'] == name
        # Count rows via the boolean mask; the previous `.count()[0]` relied
        # on deprecated positional indexing and counted the non-null values
        # of whichever column happened to come first.
        totals[name] = int(by_author.sum())
        dic_rat[name] = [int((by_author & (df['rating2'] == rat)).sum())
                         for rat in rating_li]

    # Second pass keeps the key order: all count entries first, then "_p".
    for name in top_authors:
        dic_rat[name + "_p"] = [round(count / totals[name] * 100, 1)
                                for count in dic_rat[name]]
    return dic_rat

# Rating breakdown of the ten most prolific authors of each fact-checker
# (evaluated in the order sn, pf, aap, lg).
sn_dic, pf_dic, aap_dic, lg_dic = [
    author_dic(outlet_df, 10) for outlet_df in (sn, pf, aap, lg)
]

def ar_img_per(df_dic):
    """Plot each author's share of articles per rating as a line chart.

    ``df_dic`` maps an author name to a list of six per-rating counts and
    ``name + "_p"`` to the matching list of percentages. Up to ten authors are
    drawn, labelled "Author A" ... "Author J" in dictionary order. The figure
    is shown and then closed.
    """
    import string

    # Create a color palette
    palette = plt.get_cmap('Paired')

    author_li2 = ["Author " + x for x in string.ascii_uppercase[0:10]]
    rating_li = ['False', 'Mostly False', 'Mixture', 'Mostly True', 'True', 'Others']
    # Only keys that have a matching "_p" entry are author names. Taking the
    # first ten keys blindly picked up "_p" keys whenever the dictionary held
    # fewer than ten authors and then failed on the `name + "_p"` lookup.
    author_li = [key for key in df_dic if key + "_p" in df_dic][:10]

    # Percentage: top 10 authors' fact-checks by rating
    for num, name in enumerate(author_li):
        plt.plot(rating_li, df_dic[name + "_p"], marker='', color=palette(num), linewidth=1, alpha=0.9,
                 label=author_li2[num])

    plt.xlabel("Rating")
    plt.ylabel("Percentage (%)")
    # Show the graph
    plt.legend(fontsize=8)
    plt.ylim([0, 100])
    plt.xticks(np.arange(len(rating_li)), rating_li, rotation=90)
    plt.subplots_adjust(bottom=0.35, right=0.95, top=0.95)
    plt.show()
    plt.close()


def ar_img_abs(df_dic):
    """Plot each author's number of articles per rating as a line chart.

    ``df_dic`` maps an author name to a list of six per-rating counts and
    ``name + "_p"`` to the matching list of percentages. Up to ten authors are
    drawn, labelled "Author A" ... "Author J" in dictionary order. The figure
    is shown and then closed.
    """
    import string

    # Create a color palette
    palette = plt.get_cmap('Paired')

    author_li2 = ["Author " + x for x in string.ascii_uppercase[0:10]]
    rating_li = ['False', 'Mostly False', 'Mixture', 'Mostly True', 'True', 'Others']
    # Only keys that have a matching "_p" entry are author names. Taking the
    # first ten keys blindly would plot the percentage lists as if they were
    # counts whenever the dictionary held fewer than ten authors.
    author_li = [key for key in df_dic if key + "_p" in df_dic][:10]

    for num, name in enumerate(author_li):
        plt.plot(rating_li, df_dic[name], marker='', color=palette(num), linewidth=1, alpha=0.9, label=author_li2[num])
    plt.xlabel("Rating")
    plt.ylabel("Number of articles")
    plt.legend(fontsize=8)
    plt.xticks(np.arange(len(rating_li)), rating_li, rotation=90)
    plt.subplots_adjust(bottom=0.35, top=0.90, right=0.95)
    plt.show()
    plt.close()

ar_img_per(sn_dic) # Can generate other fact-checkers' images as well

# Claim Similarity Comparison
sn.columns
pf.columns
lg.columns
aap.columns

# One text per article, stacked in the order aap, pf, sn, lg; the index is
# renumbered so positions are contiguous across the four sources.
statement_parts = [aap['title'], pf['claim'], sn['claim'], lg['title']]
statements = pd.concat(statement_parts, ignore_index=True)
# Sanity checks: no rows lost, and how many texts are missing.
len(statements) == len(aap) + len(pf) + len(sn) + len(lg)
statements.isnull().sum()

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit one vocabulary on all statements, then work on the dense matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(statements)
arr = X.toarray()

# Row boundaries follow the cumulative lengths of aap, pf and sn; everything
# after the last boundary belongs to lg.
aap_end = len(aap)
pf_end = aap_end + len(pf)
sn_end = pf_end + len(sn)
arr_aap = arr[:aap_end]
arr_pf = arr[aap_end:pf_end]
arr_sn = arr[pf_end:sn_end]
arr_lg = arr[sn_end:]

# Sanity checks: every slice has as many rows as its source frame.
len(arr_aap)==len(aap)
len(arr_pf)==len(pf)
len(arr_sn)==len(sn)
len(arr_lg)==len(lg)

# Evaluation
from sklearn.metrics.pairwise import cosine_similarity

def sim(arr1,arr2,cri):
    """Share of items in each collection that have a close match in the other.

    Parameters
    ----------
    arr1, arr2 : feature matrices, one row per item.
    cri : similarity threshold; a pair is a match when its cosine similarity
        is strictly greater than ``cri``.

    Returns
    -------
    (p1, p2) : ``p1`` is the percentage of rows of ``arr1`` with at least one
        match in ``arr2``; ``p2`` is the percentage of rows of ``arr2`` with
        at least one match in ``arr1``.
    """
    tfidf_sim = cosine_similarity(arr1,arr2)
    above = tfidf_sim > cri
    # Row counts come from the similarity matrix itself, so the inputs do not
    # have to support len().
    n1, n2 = tfidf_sim.shape
    # Array reductions replace the Python-level loops over every matrix cell.
    matched1 = int(above.any(axis=1).sum())
    matched2 = int(above.any(axis=0).sum())
    return (matched1/n1)*100, (matched2/n2)*100

# Pairwise overlap between fact-checkers. Every call uses the same cosine
# similarity threshold.
SIM_THRESHOLD = 0.5

# Snopes against the other three
sim(arr_sn, arr_pf, cri=SIM_THRESHOLD)   # sn vs. pf
sim(arr_sn, arr_aap, cri=SIM_THRESHOLD)  # sn vs. aap
sim(arr_sn, arr_lg, cri=SIM_THRESHOLD)   # sn vs. lg

# Politifact against the remaining two
sim(arr_pf, arr_aap, cri=SIM_THRESHOLD)  # pf vs. aap
sim(arr_pf, arr_lg, cri=SIM_THRESHOLD)   # pf vs. lg

# AAP against Logically
sim(arr_aap, arr_lg, cri=SIM_THRESHOLD)  # aap vs. lg